# Author: Stephen Situ
# This is a practice project on cleaning data using panda dataframes in python on jupyter
# The original dataset can be found here: https://www.kaggle.com/datasets/gagandeep16/car-sales
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib
# Read Data
car_sales_data = pd.read_csv("Car_sales.csv")
# Shape of Data
car_sales_data.shape
(157, 16)
# Variable Types
car_sales_data.info
<bound method DataFrame.info of Manufacturer Model Sales_in_thousands __year_resale_value \ 0 Acura Integra 16.919 16.360 1 Acura TL 39.384 19.875 2 Acura CL 14.114 18.225 3 Acura RL 8.588 29.725 4 Audi A4 20.397 22.255 .. ... ... ... ... 152 Volvo V40 3.545 NaN 153 Volvo S70 15.245 NaN 154 Volvo V70 17.531 NaN 155 Volvo C70 3.493 NaN 156 Volvo S80 18.969 NaN Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase \ 0 Passenger 21.50 1.8 140.0 101.2 1 Passenger 28.40 3.2 225.0 108.1 2 Passenger NaN 3.2 225.0 106.9 3 Passenger 42.00 3.5 210.0 114.6 4 Passenger 23.99 1.8 150.0 102.6 .. ... ... ... ... ... 152 Passenger 24.40 1.9 160.0 100.5 153 Passenger 27.50 2.4 168.0 104.9 154 Passenger 28.80 2.4 168.0 104.9 155 Passenger 45.50 2.3 236.0 104.9 156 Passenger 36.00 2.9 201.0 109.9 Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch \ 0 67.3 172.4 2.639 13.2 28.0 2/2/2012 1 70.3 192.9 3.517 17.2 25.0 6/3/2011 2 70.6 192.0 3.470 17.2 26.0 1/4/2012 3 71.4 196.6 3.850 18.0 22.0 3/10/2011 4 68.2 178.0 2.998 16.4 27.0 10/8/2011 .. ... ... ... ... ... ... 152 67.6 176.6 3.042 15.8 25.0 9/21/2011 153 69.3 185.9 3.208 17.9 25.0 11/24/2012 154 69.3 186.2 3.259 17.9 25.0 6/25/2011 155 71.5 185.7 3.601 18.5 23.0 4/26/2011 156 72.1 189.8 3.600 21.1 24.0 11/14/2011 Power_perf_factor 0 58.280150 1 91.370778 2 NaN 3 91.389779 4 62.777639 .. ... 152 66.498812 153 70.654495 154 71.155978 155 101.623357 156 85.735655 [157 rows x 16 columns]>
# DataFrame Heads
car_sales_data.head(5)
Manufacturer | Model | Sales_in_thousands | __year_resale_value | Vehicle_type | Price_in_thousands | Engine_size | Horsepower | Wheelbase | Width | Length | Curb_weight | Fuel_capacity | Fuel_efficiency | Latest_Launch | Power_perf_factor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Acura | Integra | 16.919 | 16.360 | Passenger | 21.50 | 1.8 | 140.0 | 101.2 | 67.3 | 172.4 | 2.639 | 13.2 | 28.0 | 2/2/2012 | 58.280150 |
1 | Acura | TL | 39.384 | 19.875 | Passenger | 28.40 | 3.2 | 225.0 | 108.1 | 70.3 | 192.9 | 3.517 | 17.2 | 25.0 | 6/3/2011 | 91.370778 |
2 | Acura | CL | 14.114 | 18.225 | Passenger | NaN | 3.2 | 225.0 | 106.9 | 70.6 | 192.0 | 3.470 | 17.2 | 26.0 | 1/4/2012 | NaN |
3 | Acura | RL | 8.588 | 29.725 | Passenger | 42.00 | 3.5 | 210.0 | 114.6 | 71.4 | 196.6 | 3.850 | 18.0 | 22.0 | 3/10/2011 | 91.389779 |
4 | Audi | A4 | 20.397 | 22.255 | Passenger | 23.99 | 1.8 | 150.0 | 102.6 | 68.2 | 178.0 | 2.998 | 16.4 | 27.0 | 10/8/2011 | 62.777639 |
# DataFrame Tails
car_sales_data.tail(5)
Manufacturer | Model | Sales_in_thousands | __year_resale_value | Vehicle_type | Price_in_thousands | Engine_size | Horsepower | Wheelbase | Width | Length | Curb_weight | Fuel_capacity | Fuel_efficiency | Latest_Launch | Power_perf_factor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
152 | Volvo | V40 | 3.545 | NaN | Passenger | 24.4 | 1.9 | 160.0 | 100.5 | 67.6 | 176.6 | 3.042 | 15.8 | 25.0 | 9/21/2011 | 66.498812 |
153 | Volvo | S70 | 15.245 | NaN | Passenger | 27.5 | 2.4 | 168.0 | 104.9 | 69.3 | 185.9 | 3.208 | 17.9 | 25.0 | 11/24/2012 | 70.654495 |
154 | Volvo | V70 | 17.531 | NaN | Passenger | 28.8 | 2.4 | 168.0 | 104.9 | 69.3 | 186.2 | 3.259 | 17.9 | 25.0 | 6/25/2011 | 71.155978 |
155 | Volvo | C70 | 3.493 | NaN | Passenger | 45.5 | 2.3 | 236.0 | 104.9 | 71.5 | 185.7 | 3.601 | 18.5 | 23.0 | 4/26/2011 | 101.623357 |
156 | Volvo | S80 | 18.969 | NaN | Passenger | 36.0 | 2.9 | 201.0 | 109.9 | 72.1 | 189.8 | 3.600 | 21.1 | 24.0 | 11/14/2011 | 85.735655 |
# Summary / Descriptive Statistics
car_sales_data.describe
<bound method NDFrame.describe of Manufacturer Model Sales_in_thousands __year_resale_value \ 0 Acura Integra 16.919 16.360 1 Acura TL 39.384 19.875 2 Acura CL 14.114 18.225 3 Acura RL 8.588 29.725 4 Audi A4 20.397 22.255 .. ... ... ... ... 152 Volvo V40 3.545 NaN 153 Volvo S70 15.245 NaN 154 Volvo V70 17.531 NaN 155 Volvo C70 3.493 NaN 156 Volvo S80 18.969 NaN Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase \ 0 Passenger 21.50 1.8 140.0 101.2 1 Passenger 28.40 3.2 225.0 108.1 2 Passenger NaN 3.2 225.0 106.9 3 Passenger 42.00 3.5 210.0 114.6 4 Passenger 23.99 1.8 150.0 102.6 .. ... ... ... ... ... 152 Passenger 24.40 1.9 160.0 100.5 153 Passenger 27.50 2.4 168.0 104.9 154 Passenger 28.80 2.4 168.0 104.9 155 Passenger 45.50 2.3 236.0 104.9 156 Passenger 36.00 2.9 201.0 109.9 Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch \ 0 67.3 172.4 2.639 13.2 28.0 2/2/2012 1 70.3 192.9 3.517 17.2 25.0 6/3/2011 2 70.6 192.0 3.470 17.2 26.0 1/4/2012 3 71.4 196.6 3.850 18.0 22.0 3/10/2011 4 68.2 178.0 2.998 16.4 27.0 10/8/2011 .. ... ... ... ... ... ... 152 67.6 176.6 3.042 15.8 25.0 9/21/2011 153 69.3 185.9 3.208 17.9 25.0 11/24/2012 154 69.3 186.2 3.259 17.9 25.0 6/25/2011 155 71.5 185.7 3.601 18.5 23.0 4/26/2011 156 72.1 189.8 3.600 21.1 24.0 11/14/2011 Power_perf_factor 0 58.280150 1 91.370778 2 NaN 3 91.389779 4 62.777639 .. ... 152 66.498812 153 70.654495 154 71.155978 155 101.623357 156 85.735655 [157 rows x 16 columns]>
# Check Data Types
car_sales_data.dtypes
Manufacturer object Model object Sales_in_thousands float64 __year_resale_value float64 Vehicle_type object Price_in_thousands float64 Engine_size float64 Horsepower float64 Wheelbase float64 Width float64 Length float64 Curb_weight float64 Fuel_capacity float64 Fuel_efficiency float64 Latest_Launch object Power_perf_factor float64 dtype: object
# Create dataframe of only categorical columns
categorical = car_sales_data.dtypes[car_sales_data.dtypes == "object"].index
print(categorical)
car_sales_data[categorical]
Index(['Manufacturer', 'Model', 'Vehicle_type', 'Latest_Launch'], dtype='object')
Manufacturer | Model | Vehicle_type | Latest_Launch | |
---|---|---|---|---|
0 | Acura | Integra | Passenger | 2/2/2012 |
1 | Acura | TL | Passenger | 6/3/2011 |
2 | Acura | CL | Passenger | 1/4/2012 |
3 | Acura | RL | Passenger | 3/10/2011 |
4 | Audi | A4 | Passenger | 10/8/2011 |
... | ... | ... | ... | ... |
152 | Volvo | V40 | Passenger | 9/21/2011 |
153 | Volvo | S70 | Passenger | 11/24/2012 |
154 | Volvo | V70 | Passenger | 6/25/2011 |
155 | Volvo | C70 | Passenger | 4/26/2011 |
156 | Volvo | S80 | Passenger | 11/14/2011 |
157 rows × 4 columns
# Sorting the first 15 items in Model
sorted(car_sales_data["Model"])[0:15]
['3-Sep', '3000GT', '300M', '323i', '328i', '4Runner', '5-Sep', '528i', 'A4', 'A6', 'A8', 'Accent', 'Accord', 'Alero', 'Altima']
# Describe only Manufacterer column
car_sales_data["Manufacturer"].describe
<bound method NDFrame.describe of 0 Acura 1 Acura 2 Acura 3 Acura 4 Audi ... 152 Volvo 153 Volvo 154 Volvo 155 Volvo 156 Volvo Name: Manufacturer, Length: 157, dtype: object>
# Check Unique Values in Model
car_sales_data["Model"].unique()
array(['Integra', 'TL', 'CL', 'RL', 'A4', 'A6', 'A8', '323i', '328i', '528i', 'Century', 'Regal', 'Park Avenue', 'LeSabre', 'DeVille', 'Seville', 'Eldorado', 'Catera', 'Escalade', 'Cavalier', 'Malibu', 'Lumina', 'Monte Carlo', 'Camaro', 'Corvette', 'Prizm', 'Metro', 'Impala', 'Sebring Coupe', 'Sebring Conv.', 'Concorde', 'Cirrus', 'LHS', 'Town & Country', '300M', 'Neon', 'Avenger', 'Stratus', 'Intrepid', 'Viper', 'Ram Pickup', 'Ram Wagon', 'Ram Van', 'Dakota', 'Durango', 'Caravan', 'Escort', 'Mustang', 'Contour', 'Taurus', 'Focus', 'Crown Victoria', 'Explorer', 'Windstar', 'Expedition', 'Ranger', 'F-Series', 'Civic', 'Accord', 'CR-V', 'Passport', 'Odyssey', 'Accent', 'Elantra', 'Sonata', 'I30', 'S-Type', 'Wrangler', 'Cherokee', 'Grand Cherokee', 'ES300', 'GS300', 'GS400', 'LS400', 'LX470', 'RX300', 'Continental', 'Town car', 'Navigator', 'Mirage', 'Eclipse', 'Galant', 'Diamante', '3000GT', 'Montero', 'Montero Sport', 'Mystique', 'Cougar', 'Sable', 'Grand Marquis', 'Mountaineer', 'Villager', 'C-Class', 'E-Class', 'S-Class', 'SL-Class', 'SLK', 'SLK230', 'CLK Coupe', 'CL500', 'M-Class', 'Sentra', 'Altima', 'Maxima', 'Quest', 'Pathfinder', 'Xterra', 'Frontier', 'Cutlass', 'Intrigue', 'Alero', 'Aurora', 'Bravada', 'Silhouette', 'Breeze', 'Voyager', 'Prowler', 'Sunfire', 'Grand Am', 'Firebird', 'Grand Prix', 'Bonneville', 'Montana', 'Boxter', 'Carrera Coupe', 'Carrera Cabrio', '5-Sep', '3-Sep', 'SL', 'SC', 'SW', 'LW', 'LS', 'Outback', 'Forester', 'Corolla', 'Camry', 'Avalon', 'Celica', 'Tacoma', 'Sienna', 'RAV4', '4Runner', 'Land Cruiser', 'Golf', 'Jetta', 'Passat', 'Cabrio', 'GTI', 'Beetle', 'S40', 'V40', 'S70', 'V70', 'C70', 'S80'], dtype=object)
# Find Nulls
nan_rows = car_sales_data[car_sales_data.isna().any(axis=1)]
print(nan_rows)
nan_rows.index
Manufacturer Model Sales_in_thousands __year_resale_value \ 2 Acura CL 14.114 18.225 7 BMW 323i 19.747 NaN 15 Cadillac Seville 15.943 27.100 18 Cadillac Escalade 14.785 NaN 27 Chevrolet Impala 107.995 NaN 33 Chrysler Town & Country 53.480 19.540 34 Chrysler 300M 30.696 NaN 38 Dodge Intrepid 88.028 12.275 44 Dodge Durango 101.323 NaN 50 Ford Focus 175.670 NaN 66 Jaguar S-Type 15.467 NaN 72 Lexus GS400 3.334 NaN 74 Lexus LX470 9.126 NaN 75 Lexus RX300 51.238 NaN 78 Lincoln Navigator 22.925 NaN 96 Mercedes-B SLK 7.998 NaN 97 Mercedes-B SLK230 1.526 NaN 98 Mercedes-B CLK Coupe 11.592 NaN 99 Mercedes-B CL500 0.954 NaN 100 Mercedes-B M-Class 28.976 NaN 106 Nissan Xterra 54.158 NaN 107 Nissan Frontier 65.005 NaN 109 Oldsmobile Intrigue 38.554 NaN 110 Oldsmobile Alero 80.255 NaN 117 Plymouth Prowler 1.872 NaN 123 Pontiac Montana 39.572 NaN 127 Saab 5-Sep 9.191 NaN 128 Saab 3-Sep 12.115 NaN 132 Saturn LW 8.472 NaN 133 Saturn LS 49.989 NaN 134 Subaru Outback 47.107 NaN 135 Subaru Forester 33.028 NaN 141 Toyota Sienna 65.119 NaN 150 Volkswagen Beetle 49.463 NaN 151 Volvo S40 16.957 NaN 152 Volvo V40 3.545 NaN 153 Volvo S70 15.245 NaN 154 Volvo V70 17.531 NaN 155 Volvo C70 3.493 NaN 156 Volvo S80 18.969 NaN Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase \ 2 Passenger NaN 3.2 225.0 106.9 7 Passenger 26.990 2.5 170.0 107.3 15 Passenger 44.475 4.6 275.0 112.2 18 Car 46.225 5.7 255.0 117.5 27 Passenger 18.890 3.4 180.0 110.5 33 Car NaN NaN NaN NaN 34 Passenger 29.185 3.5 253.0 113.0 38 Passenger 22.505 2.7 202.0 113.0 44 Car 26.310 5.2 230.0 115.7 50 Passenger 12.315 2.0 107.0 103.0 66 Passenger 42.800 3.0 240.0 114.5 72 Passenger 46.305 4.0 300.0 110.2 74 Car 60.105 4.7 230.0 112.2 75 Car 34.605 3.0 220.0 103.0 78 Car 42.660 5.4 300.0 119.0 96 Passenger 38.900 2.3 190.0 94.5 97 Passenger 41.000 2.3 185.0 94.5 98 Passenger 41.600 3.2 215.0 105.9 99 Passenger 85.500 5.0 302.0 113.6 100 Car 35.300 3.2 215.0 111.0 106 Car 22.799 3.3 170.0 104.3 107 Car 17.890 3.3 170.0 116.1 109 Passenger 24.150 3.5 215.0 109.0 110 Passenger 18.270 2.4 150.0 107.0 117 Passenger 43.000 3.5 253.0 113.3 123 Car 25.635 3.4 185.0 120.0 127 Passenger 33.120 2.3 170.0 106.4 128 Passenger 26.100 2.0 185.0 102.6 132 Passenger 18.835 2.2 137.0 106.5 133 Passenger 15.010 2.2 137.0 106.5 134 Passenger 22.695 2.5 165.0 103.5 135 Car 20.095 2.5 165.0 99.4 141 Car 22.368 3.0 194.0 114.2 150 Passenger 15.900 2.0 115.0 98.9 151 Passenger 23.400 1.9 160.0 100.5 152 Passenger 24.400 1.9 160.0 100.5 153 Passenger 27.500 2.4 168.0 104.9 154 Passenger 28.800 2.4 168.0 104.9 155 Passenger 45.500 2.3 236.0 104.9 156 Passenger 36.000 2.9 201.0 109.9 Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch \ 2 70.6 192.0 3.470 17.2 26.0 1/4/2012 7 68.4 176.0 3.179 16.6 26.0 6/28/2011 15 75.0 201.0 NaN 18.5 22.0 4/29/2011 18 77.0 201.2 5.572 30.0 15.0 4/17/2012 27 73.0 200.0 3.389 17.0 27.0 6/18/2011 33 NaN NaN NaN NaN NaN 7/13/2011 34 74.4 197.8 3.567 17.0 23.0 2/10/2012 38 74.7 203.7 3.489 17.0 NaN 6/2/2012 44 71.7 193.5 4.394 25.0 17.0 6/27/2012 50 66.9 174.8 2.564 13.2 30.0 7/22/2012 66 71.6 191.3 3.650 18.4 21.0 11/3/2012 72 70.9 189.2 3.693 19.8 21.0 11/28/2012 74 76.4 192.5 5.401 25.4 15.0 10/30/2012 75 71.5 180.1 3.900 17.2 21.0 1/4/2012 78 79.9 204.8 5.393 30.0 15.0 12/23/2012 96 67.5 157.9 3.055 15.9 26.0 1/16/2011 97 67.5 157.3 2.975 14.0 27.0 8/6/2011 98 67.8 180.3 3.213 16.4 26.0 7/8/2011 99 73.1 196.6 4.115 23.2 20.0 4/11/2011 100 72.2 180.6 4.387 19.0 20.0 2/10/2011 106 70.4 178.0 3.821 19.4 18.0 1/24/2011 107 66.5 196.1 3.217 19.4 18.0 8/27/2011 109 73.6 195.9 3.455 18.0 NaN 4/1/2011 110 70.1 186.7 2.958 15.0 27.0 10/20/2009 117 76.3 165.4 2.850 12.0 21.0 6/27/2012 123 72.7 201.3 3.942 25.0 23.0 7/22/2012 127 70.6 189.2 3.280 18.5 23.0 11/9/2012 128 67.4 182.2 2.990 16.9 23.0 6/12/2011 132 69.0 190.4 3.075 13.1 27.0 8/5/2011 133 69.0 190.4 2.910 13.1 28.0 12/4/2012 134 67.5 185.8 3.415 16.9 25.0 7/7/2011 135 68.3 175.2 3.125 15.9 24.0 9/10/2012 141 73.4 193.5 3.759 20.9 22.0 10/5/2012 150 67.9 161.1 2.769 14.5 26.0 10/20/2011 151 67.6 176.6 2.998 15.8 25.0 2/18/2011 152 67.6 176.6 3.042 15.8 25.0 9/21/2011 153 69.3 185.9 3.208 17.9 25.0 11/24/2012 154 69.3 186.2 3.259 17.9 25.0 6/25/2011 155 71.5 185.7 3.601 18.5 23.0 4/26/2011 156 72.1 189.8 3.600 21.1 24.0 11/14/2011 Power_perf_factor 2 NaN 7 71.191207 15 115.621358 18 109.509117 27 71.838039 33 NaN 34 101.655244 38 80.831470 44 92.854125 50 43.117132 66 102.178985 72 125.013357 74 105.760458 75 91.943802 78 123.972047 96 82.807362 97 81.848969 98 92.925792 99 141.100985 100 90.495532 106 69.782944 107 67.889271 109 86.272523 110 60.727447 117 106.984456 123 76.208440 127 73.503778 128 76.023048 132 56.295243 133 54.819728 134 67.765908 135 66.762943 141 78.027219 150 47.329632 151 66.113057 152 66.498812 153 70.654495 154 71.155978 155 101.623357 156 85.735655
Int64Index([ 2, 7, 15, 18, 27, 33, 34, 38, 44, 50, 66, 72, 74, 75, 78, 96, 97, 98, 99, 100, 106, 107, 109, 110, 117, 123, 127, 128, 132, 133, 134, 135, 141, 150, 151, 152, 153, 154, 155, 156], dtype='int64')
# Remove any rows with Nulls
car_sales_clean = car_sales_data.drop(labels=nan_rows.index,axis=0,inplace=False)
# Find Duplicates
duplicate_rows = car_sales_clean[car_sales_clean.duplicated(keep='last')]
print(duplicate_rows)
Empty DataFrame Columns: [Manufacturer, Model, Sales_in_thousands, __year_resale_value, Vehicle_type, Price_in_thousands, Engine_size, Horsepower, Wheelbase, Width, Length, Curb_weight, Fuel_capacity, Fuel_efficiency, Latest_Launch, Power_perf_factor] Index: []
# Find Null in a column
car_sales_clean[car_sales_clean[['Horsepower']].isna().any(axis=1)]
Manufacturer | Model | Sales_in_thousands | __year_resale_value | Vehicle_type | Price_in_thousands | Engine_size | Horsepower | Wheelbase | Width | Length | Curb_weight | Fuel_capacity | Fuel_efficiency | Latest_Launch | Power_perf_factor |
---|
# Group by Manufacturer / Model and Sales
car_sales_clean.groupby(['Manufacturer','Model'])['Sales_in_thousands'].sum()
Manufacturer Model Acura Integra 16.919 RL 8.588 TL 39.384 Audi A4 20.397 A6 18.780 ... Volkswagen Cabrio 9.569 GTI 5.596 Golf 9.761 Jetta 83.721 Passat 51.102 Name: Sales_in_thousands, Length: 117, dtype: float64
# Save Cleaned Csv
car_sales_clean.to_csv('car_sales_clean.csv')